Data Inspection¶
In [1]:
# Initialise Plotly's offline mode so figures render inline in this notebook.
import plotly.offline
plotly.offline.init_notebook_mode()
In [9]:
import pandas as pd
from tqdm import tqdm

# Load the 10-year, top-24-airport flight dataset in 100k-row chunks so that
# tqdm can show progress, then assemble the chunks into one DataFrame.
chunks = []
reader = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv",
    chunksize=100_000,
    # CANCELLATION_CODE mixes letters and missing values; force str so pandas
    # does not infer a mixed dtype per chunk.
    dtype={"CANCELLATION_CODE": str},
)
for chunk in tqdm(reader):
    chunks.append(chunk)
df = pd.concat(chunks)
df.head()
130it [00:27, 4.66it/s]
Out[9]:
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 28.0 | 1214.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 13.0 | 945.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | 6.0 | 1341.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 7.0 | 1159.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 6.0 | 2317.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 24 columns
In [10]:
# Summary statistics for all numeric columns — quick sanity check of counts,
# ranges, and obvious outliers (e.g. DEP_DELAY max of 3695 minutes).
df.describe()
Out[10]:
| ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEP_TIME | DEP_DELAY | TAXI_OUT | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.271494e+07 | 1.271486e+07 | 1.271048e+07 | 1.270744e+07 | 1.270744e+07 | 1.268139e+07 | 1.292656e+07 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 |
| mean | 1.298824e+06 | 3.161338e+04 | 1.298812e+06 | 3.161370e+04 | 1.332409e+03 | 1.127152e+01 | 1.722694e+01 | 8.678440e+00 | 1.471801e+03 | 5.409690e+00 | 1.675551e-02 | 2.041558e+01 | 2.650336e+00 | 1.520350e+01 | 1.268451e-01 | 2.413592e+01 |
| std | 1.453419e+05 | 1.168922e+03 | 1.453231e+05 | 1.168763e+03 | 5.192380e+02 | 4.514114e+01 | 9.141093e+00 | 6.704624e+00 | 5.554134e+02 | 4.738376e+01 | 1.283540e-01 | 5.935956e+01 | 2.208897e+01 | 3.191662e+01 | 3.210375e+00 | 5.163883e+01 |
| min | 1.039705e+06 | 3.019400e+04 | 1.039705e+06 | 3.019400e+04 | 1.000000e+00 | -2.340000e+02 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | -2.380000e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.129806e+06 | 3.046600e+04 | 1.129806e+06 | 3.046600e+04 | 9.060000e+02 | -5.000000e+00 | 1.200000e+01 | 5.000000e+00 | 1.052000e+03 | -1.400000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 1.289208e+06 | 3.145400e+04 | 1.289208e+06 | 3.145400e+04 | 1.324000e+03 | -1.000000e+00 | 1.500000e+01 | 7.000000e+00 | 1.515000e+03 | -5.000000e+00 | 0.000000e+00 | 3.000000e+00 | 0.000000e+00 | 2.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 1.410702e+06 | 3.245700e+04 | 1.410702e+06 | 3.245700e+04 | 1.751000e+03 | 9.000000e+00 | 2.000000e+01 | 1.000000e+01 | 1.930000e+03 | 9.000000e+00 | 0.000000e+00 | 2.000000e+01 | 0.000000e+00 | 1.900000e+01 | 0.000000e+00 | 2.800000e+01 |
| max | 1.501606e+06 | 3.481900e+04 | 1.501606e+06 | 3.481900e+04 | 2.400000e+03 | 3.695000e+03 | 2.270000e+02 | 1.419000e+03 | 2.400000e+03 | 3.680000e+03 | 1.000000e+00 | 3.359000e+03 | 2.692000e+03 | 1.511000e+03 | 9.870000e+02 | 3.581000e+03 |
In [11]:
# Latest flight date in the dataset (string comparison works because the
# dates are ISO-formatted); output shows coverage through 2024-06-30.
df.FL_DATE.max()
Out[11]:
'2024-06-30'
In [5]:
# Cardinality of every column — e.g. 24 origin/destination airports,
# 20 carriers, 3653 distinct flight dates (10 years).
df.nunique()
Out[5]:
FL_DATE 3653 OP_UNIQUE_CARRIER 20 TAIL_NUM 9140 ORIGIN_AIRPORT_SEQ_ID 58 ORIGIN_CITY_MARKET_ID 20 ORIGIN 24 ORIGIN_CITY_NAME 24 DEST_AIRPORT_SEQ_ID 58 DEST_CITY_MARKET_ID 20 DEST 24 DEST_CITY_NAME 24 DEP_TIME 1440 DEP_DELAY 1783 TAXI_OUT 195 TAXI_IN 247 ARR_TIME 1440 ARR_DELAY 1806 CANCELLED 2 CANCELLATION_CODE 4 CARRIER_DELAY 1558 WEATHER_DELAY 971 NAS_DELAY 855 SECURITY_DELAY 268 LATE_AIRCRAFT_DELAY 1228 dtype: int64
In [6]:
# Inferred dtypes — note FL_DATE is still a plain object (string) here;
# it is converted to datetime in the cleaning section below.
df.dtypes
Out[6]:
FL_DATE object OP_UNIQUE_CARRIER object TAIL_NUM object ORIGIN_AIRPORT_SEQ_ID int64 ORIGIN_CITY_MARKET_ID int64 ORIGIN object ORIGIN_CITY_NAME object DEST_AIRPORT_SEQ_ID int64 DEST_CITY_MARKET_ID int64 DEST object DEST_CITY_NAME object DEP_TIME float64 DEP_DELAY float64 TAXI_OUT float64 TAXI_IN float64 ARR_TIME float64 ARR_DELAY float64 CANCELLED float64 CANCELLATION_CODE object CARRIER_DELAY float64 WEATHER_DELAY float64 NAS_DELAY float64 SECURITY_DELAY float64 LATE_AIRCRAFT_DELAY float64 dtype: object
Data Cleaning¶
In [7]:
# IATA codes of the ten medium-sized airports in the Top-24 dataset;
# every other airport is treated as "Large" downstream.
MEDIUM_AIRPORT_CODE = [
    "DAL", "PDX", "STL", "RDU", "HOU",
    "SMF", "MSY", "SJC", "SJU", "SNA",
]
In [12]:
import matplotlib.pyplot as plt

# Keep only the columns needed downstream: flight date, tail number (the join
# key for the FAA registration data), delay measures, and cancellation info.
# Fix: the original drop list named ORIGIN_AIRPORT_SEQ_ID and
# DEST_AIRPORT_SEQ_ID twice; each column is listed exactly once here.
# NOTE(review): ORIGIN and DEST are dropped here, but later cells (the
# per-ORIGIN delay-share table and the DEST box plot) still reference them —
# those cells only worked via stale kernel state. Keep ORIGIN/DEST if those
# analyses are wanted on a fresh Restart & Run All.
target_df = df.drop(columns=[
    "OP_UNIQUE_CARRIER",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN",
    "ORIGIN_CITY_NAME",
    "DEST_AIRPORT_SEQ_ID",
    "DEST_CITY_MARKET_ID",
    "DEST",
    "DEST_CITY_NAME",
    "DEP_TIME",
    "TAXI_OUT",
    "TAXI_IN",
    "ARR_TIME",
])
del df  # free the ~13M-row raw frame; only target_df is used from here on
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
# Fraction of missing values per column — the five *_DELAY cause columns are
# ~80% NaN (only populated for delayed flights), CANCELLATION_CODE ~98% NaN.
target_df.isnull().mean()
Out[12]:
FL_DATE 0.000000 TAIL_NUM 0.004830 DEP_DELAY 0.016377 ARR_DELAY 0.018966 CANCELLED 0.000000 CANCELLATION_CODE 0.983244 CARRIER_DELAY 0.801989 WEATHER_DELAY 0.801989 NAS_DELAY 0.801989 SECURITY_DELAY 0.801989 LATE_AIRCRAFT_DELAY 0.801989 dtype: float64
In [17]:
# Unique tail numbers that actually appear in the flight data.
plane_set_df = target_df.TAIL_NUM.drop_duplicates()
plane_review = pd.read_csv("../supplementary/N-Number-Registration-Data-2024.csv")
# The registration file stores N-numbers without the leading "N"; prepend it.
# Fix: vectorized string concatenation replaces the element-wise
# .apply(lambda x: "N"+x) — same result, far faster on ~300k rows.
plane_review['N-NUMBER'] = "N" + plane_review['N-NUMBER']
# Keep only aircraft that fly in our dataset.
plane_review = plane_review[plane_review['N-NUMBER'].isin(plane_set_df)]
plane_review
Out[17]:
| N-NUMBER | COUNTRY | TYPE AIRCRAFT | TYPE ENGINE | MFR | MODEL | NO-ENG | NO-SEATS | AC-WEIGHT | MFR_ENGINE | MODEL_ENGINE | HORSEPOWER | THRUST | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 651 | N10156 | US | Fixed wing multi engine | Turbo-fan | EMBRAER | EMB-145XR | 2 | 55 | 12,500 - 19,999 | ROLLS-ROYC | AE3007 SER | 0.0 | 6442.0 |
| 813 | N101DQ | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-211 | 2 | 199 | 20,000 and over | CFM INTL | CFM56-5B3/3 | 0.0 | 32000.0 |
| 816 | N101DU | US | Fixed wing multi engine | Turbo-fan | C SERIES AIRCRAFT LTD PTNRSP | BD-500-1A10 | 2 | 133 | 20,000 and over | P & W | PW1519G | 0.0 | 19775.0 |
| 874 | N101HQ | US | Fixed wing multi engine | Turbo-fan | EMBRAER-EMPRESA BRASILEIRA DE | ERJ 170-200 LR | 2 | 80 | 20,000 and over | GE | CF34-8E5 | 0.0 | 14510.0 |
| 946 | N101NN | US | Fixed wing multi engine | Turbo-fan | AIRBUS INDUSTRIE | A321-231 | 2 | 379 | 20,000 and over | IAE | V2500SERIES | 0.0 | 25000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 297051 | N998AN | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-231 | 2 | 379 | 20,000 and over | IAE | V2533-A5 | 0.0 | 31600.0 |
| 297052 | N998AT | US | Fixed wing multi engine | Turbo-fan | BOEING | 717-200 | 2 | 100 | 20,000 and over | ROLLS-ROYC | TAY 651-54 | 0.0 | 15400.0 |
| 297102 | N998JE | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-231 | 2 | 379 | 20,000 and over | IAE | V2533-A5 | 0.0 | 31600.0 |
| 297126 | N998NN | US | Fixed wing multi engine | Turbo-fan | BOEING | 737-800 | 2 | 175 | 20,000 and over | CFM INTL | CFM56-7B24E | 0.0 | 24200.0 |
| 297463 | N999JQ | US | Fixed wing multi engine | Turbo-fan | AIRBUS | A321-231 | 2 | 379 | 20,000 and over | IAE | V2533-A5 | 0.0 | 31600.0 |
6798 rows × 13 columns
In [16]:
# NOTE(review): `airport_review` and `airport_default_score` are never defined
# anywhere in this notebook — this cell raised NameError (see traceback
# below). The cell that loads the airport review data appears to have been
# deleted; it must be restored before Restart & Run All can succeed, since
# later visualization cells depend on `airport_review_merged`.
airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
# Join per-airport review counts with the default scores on AIRPORT_CODE.
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRPORT_CODE")
# Airports not in the medium list are classified as large.
airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE)
airport_review_merged.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[16], line 1 ----> 1 airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index().rename(columns = {0:"reviewCount"}) 2 airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRPORT_CODE") 3 airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE) NameError: name 'airport_review' is not defined
In [18]:
# Attach aircraft registration attributes to each flight via tail number.
# The inner join drops flights whose tail number is absent from the 2024
# registration snapshot.
target_airport_df = pd.merge(
    target_df,
    plane_review,
    how = "inner",
    left_on = "TAIL_NUM",
    right_on = "N-NUMBER",
    suffixes = ["", "_origin"]
)
# Fix: removed the large commented-out rename block (a bare triple-quoted
# string) — it referenced review-score columns that are not part of this
# merge and was dead code.
# N-NUMBER duplicates TAIL_NUM after the join; drop without inplace so the
# cell stays chainable and re-run safe.
target_airport_df = target_airport_df.drop(columns = ['N-NUMBER'])
# A missing CANCELLATION_CODE means the flight was not cancelled.
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[18]:
| FL_DATE | TAIL_NUM | DEP_DELAY | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | ... | TYPE ENGINE | MFR | MODEL | NO-ENG | NO-SEATS | AC-WEIGHT | MFR_ENGINE | MODEL_ENGINE | HORSEPOWER | THRUST | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | N200AA | 3.0 | -13.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Turbo-prop | TEXTRON AVIATION INC | 208B | 1 | 12 | Up to 12,499 | P&W CANADA | PT6A-140 | 867.0 | 0.0 |
| 1 | 2014-07-01 | N200AA | 5.0 | 1.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Turbo-prop | TEXTRON AVIATION INC | 208B | 1 | 12 | Up to 12,499 | P&W CANADA | PT6A-140 | 867.0 | 0.0 |
| 2 | 2014-07-01 | N201AA | -5.0 | -3.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Reciprocating | CESSNA | 150 | 1 | 2 | Up to 12,499 | CONT MOTOR | 0-200 SERIES | 100.0 | 0.0 |
| 3 | 2014-07-01 | N201AA | -5.0 | 0.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Reciprocating | CESSNA | 150 | 1 | 2 | Up to 12,499 | CONT MOTOR | 0-200 SERIES | 100.0 | 0.0 |
| 4 | 2014-07-01 | N201AA | -1.0 | -2.0 | 0.0 | Not | NaN | NaN | NaN | NaN | ... | Reciprocating | CESSNA | 150 | 1 | 2 | Up to 12,499 | CONT MOTOR | 0-200 SERIES | 100.0 | 0.0 |
5 rows × 23 columns
Visualization¶
Delay Trend¶
In [17]:
import plotly.express as px

delay_columns = [
    "DEP_DELAY",
    'CARRIER_DELAY',
    'WEATHER_DELAY',
    'NAS_DELAY',
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
]
# Fix: .copy() makes an independent frame, so adding the Year column below no
# longer triggers the SettingWithCopyWarning the original cell emitted
# (it assigned into a slice of target_airport_df).
delay_by_year = target_airport_df[['FL_DATE'] + delay_columns].copy()
delay_by_year['Year'] = delay_by_year['FL_DATE'].dt.year
# Mean delay per year for each delay type.
delay_by_year = delay_by_year.groupby("Year")[delay_columns].mean().reset_index()
# Long format: one row per (Year, delay type) for the stacked-area chart.
delay_by_year = delay_by_year.melt(id_vars = "Year", value_name = "Delay (m)")
fig = px.area(
    delay_by_year,
    x="Year",
    y="Delay (m)",
    color ='variable',
    title='Airport Delay Trend in the Past 10Y',
)
fig.update_layout(
    height=600,
)
fig.show()
C:\Users\wongh\AppData\Local\Temp\ipykernel_5332\142654084.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Airport Review¶
First, let's take a look at the airport review data we collected.
In [59]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Pairwise correlation of every review attribute from reviewCount onward.
corr_matrix = airport_review_merged.loc[:, 'reviewCount':].corr()
# The matrix is symmetric, so mask the redundant upper triangle.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask=upper_triangle)
plt.title('Airport Review Score Correlation')
plt.show()
Summary
Most of the scores given by users are strongly correlated with one another.
The exception is an airport's reviewCount, which is inversely correlated with all of the other attributes.
Airport Ranking¶
In [60]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
In [61]:
# Scatter of user sentiment vs. rating per airport, faceted by size class.
# Fix: removed the dead `fig = go.Figure()` — px.scatter returns a complete
# Figure, so the empty one was immediately overwritten.
fig = px.scatter(
    data_frame = airport_review_merged,
    x='sentiment',
    y='ratingValue',
    color='ratingValue',
    size = 'reviewCount',          # bubble size = number of reviews
    color_continuous_scale = 'viridis',
    facet_col = "Large",
    text='AIRPORT_CODE',
    log_y = True
)
fig.update_traces(textposition='top center')
# Replace the default "Large=False"/"Large=True" facet labels.
fig.for_each_annotation(
    lambda a: a.update(text= "Medium Airport" if a.text == "Large=False" else "Large Airport")
)
fig.update_layout(
    height=600,
    width=1000,
    title_text="Airport Sentiment Visualization",
    showlegend=False
)
fig.show()
Review Performance per Airport¶
In [62]:
# One bar chart per scoring attribute, each sorted best-to-worst.
df_columns = [
    'queueTime',
    'terminalCleanliness',
    'terminalSeating',
    'terminalSign',
    'foodBeverage',
    'airportShopping',
    'wifiConnectivity',
    'airportStaff',
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_columns)
# Fill the 4x2 grid left-to-right, top-to-bottom.
positions = [(r, c) for r in (1, 2, 3, 4) for c in (1, 2)]
for (r, c), column in zip(positions, df_columns):
    # Fix: sort into a local copy — the original called
    # airport_review_merged.sort_values(..., inplace=True) on every
    # iteration, mutating the shared DataFrame as a plotting side effect.
    ranked = airport_review_merged.sort_values(column, ascending=False)
    trace = go.Bar(
        x=ranked['AIRPORT_CODE'],
        y=ranked[column],
        marker=dict(
            color = ranked[column],
            colorscale='viridis'
        )
    )
    fig.add_trace(
        trace,
        row=r,
        col=c
    )
fig.update_layout(
    height=1000,
    title_text="Individual Scoring Attributes of Airports",
    showlegend=False
)
fig.show()
Delay vs Ranking¶
In [21]:
# Dtypes after the merge — FL_DATE is now datetime64; the registration
# attributes (MFR, MODEL, AC-WEIGHT, ...) arrive as object columns.
target_airport_df.dtypes
Out[21]:
FL_DATE datetime64[ns] TAIL_NUM object DEP_DELAY float64 ARR_DELAY float64 CANCELLED float64 CANCELLATION_CODE object CARRIER_DELAY float64 WEATHER_DELAY float64 NAS_DELAY float64 SECURITY_DELAY float64 LATE_AIRCRAFT_DELAY float64 COUNTRY object TYPE AIRCRAFT object TYPE ENGINE object MFR object MODEL object NO-ENG int64 NO-SEATS int64 AC-WEIGHT object MFR_ENGINE object MODEL_ENGINE object HORSEPOWER float64 THRUST float64 dtype: object
In [25]:
# Cardinality after the merge — 6798 aircraft matched the registration file
# (down from 9140 tail numbers in the raw flight data).
target_airport_df.nunique()
Out[25]:
FL_DATE 3653 TAIL_NUM 6798 DEP_DELAY 1757 ARR_DELAY 1778 CANCELLED 2 CANCELLATION_CODE 5 CARRIER_DELAY 1525 WEATHER_DELAY 923 NAS_DELAY 820 SECURITY_DELAY 255 LATE_AIRCRAFT_DELAY 1212 COUNTRY 4 TYPE AIRCRAFT 5 TYPE ENGINE 8 MFR 101 MODEL 293 NO-ENG 7 NO-SEATS 73 AC-WEIGHT 4 MFR_ENGINE 33 MODEL_ENGINE 230 HORSEPOWER 55 THRUST 99 dtype: int64
In [22]:
# Numeric aircraft attributes used as explanatory features.
train_columns = ["NO-ENG", "NO-SEATS", "HORSEPOWER", "THRUST"]

# Delay measures we relate to the aircraft attributes.
# NOTE(review): a later cell removes 'NAS_DELAY' from this list in place, so
# the list's contents depend on cell execution order.
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]
In [30]:
# NOTE(review): this cell (In[30]) was executed out of order — the
# corr_matrix shown in Out[30] is defined by the cell below (In[32]). On a
# fresh Restart & Run All this would instead display the review correlation
# matrix from the earlier section. Move this cell after the one that builds
# the aircraft-attribute corr_matrix.
corr_matrix.head()
Out[30]:
| AC-WEIGHT_12,500 - 19,999 | AC-WEIGHT_20,000 and over | AC-WEIGHT_UAV up to 55 | AC-WEIGHT_Up to 12,499 | COUNTRY_ | COUNTRY_CA | COUNTRY_GB | COUNTRY_US | HORSEPOWER | NO-ENG | ... | TYPE AIRCRAFT_Rotorcraft | TYPE AIRCRAFT_Weight-shift-control | TYPE ENGINE_2 Cycle | TYPE ENGINE_4 Cycle | TYPE ENGINE_Electric | TYPE ENGINE_Reciprocating | TYPE ENGINE_Turbo-fan | TYPE ENGINE_Turbo-jet | TYPE ENGINE_Turbo-prop | TYPE ENGINE_Turbo-shaft | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NO-ENG | 0.002914 | 0.246786 | 0.187011 | -0.335489 | 0.046053 | 0.000558 | -0.052084 | -0.012037 | 0.006483 | 1.000000 | ... | 0.377504 | -0.070834 | -0.058386 | -0.188460 | 0.747054 | -0.444784 | 0.197079 | 0.005092 | -0.140875 | -0.089214 |
| NO-SEATS | -0.165297 | 0.304866 | -0.033574 | -0.252879 | -0.011352 | -0.007109 | -0.021893 | 0.020456 | -0.147924 | 0.079682 | ... | -0.098852 | -0.027984 | -0.023100 | -0.078553 | -0.064292 | -0.202713 | 0.176289 | -0.010630 | -0.104905 | -0.064378 |
| HORSEPOWER | -0.038858 | 0.032154 | NaN | -0.010212 | 0.036069 | -0.007996 | -0.000449 | -0.020415 | 1.000000 | 0.006483 | ... | -0.001564 | -0.004849 | -0.003961 | -0.009872 | NaN | -0.020284 | 0.031009 | -0.034264 | 0.027150 | 0.003418 |
| THRUST | -0.054376 | 0.173511 | NaN | -0.176758 | -0.027787 | 0.001945 | -0.002652 | 0.018845 | -0.751091 | 0.155881 | ... | -0.046561 | -0.019958 | -0.015674 | -0.052285 | NaN | -0.145564 | 0.101029 | 0.014589 | -0.080838 | -0.037308 |
| DEP_DELAY | 0.009573 | -0.005004 | -0.000896 | -0.000781 | -0.000230 | 0.000295 | 0.001135 | -0.000454 | 0.000385 | 0.000038 | ... | 0.000323 | -0.000802 | -0.000157 | -0.001803 | -0.000674 | -0.000369 | 0.003089 | -0.002669 | -0.001903 | 0.001144 |
5 rows × 25 columns
In [32]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One-hot encode the categorical aircraft attributes, then correlate all
# aircraft features (numeric + encoded) against the delay measures.
# NOTE(review): another cell mutates label_columns in place (removing
# NAS_DELAY), so the columns shown here depend on execution order.
cat_columns = ['COUNTRY', 'TYPE AIRCRAFT', 'TYPE ENGINE', 'AC-WEIGHT']
num_columns = ['NO-ENG', 'NO-SEATS', 'HORSEPOWER', 'THRUST'] + label_columns

dummy_df = pd.get_dummies(target_airport_df[cat_columns])
new_df = pd.concat([target_airport_df[num_columns], dummy_df], axis=1)

corr_matrix = new_df.corr()
# Rows: aircraft attributes; columns: delay measures.
corr_matrix = corr_matrix.loc[corr_matrix.columns.difference(label_columns), label_columns]

sns.set(style="white")
plt.figure(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title('Correlation Matrix')
plt.show()
Share of flights with each delay type > 0, by origin airport
In [64]:
# NOTE(review): 'ORIGIN' was dropped from target_df in the cleaning cell, and
# target_airport_df inherits that, so on a fresh Restart & Run All this cell
# should raise KeyError — the Out[64] table below must have come from an
# earlier kernel state that still contained ORIGIN. Keep ORIGIN during
# cleaning if this per-airport breakdown is wanted.
cal_df = target_airport_df[['ORIGIN'] + label_columns]
# Per origin airport: fraction of flights where each delay type is > 0.
cal_df.groupby('ORIGIN')[label_columns].apply(lambda x: (x > 0).mean()).reset_index()
Out[64]:
| ORIGIN | DEP_DELAY | ARR_DELAY | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|
| 0 | ATL | 0.391977 | 0.346680 | 0.102059 | 0.012486 | 0.098133 | 0.000559 | 0.070856 |
| 1 | CLT | 0.386671 | 0.377364 | 0.117649 | 0.012658 | 0.119450 | 0.002985 | 0.094161 |
| 2 | DAL | 0.511887 | 0.405451 | 0.138866 | 0.010460 | 0.092957 | 0.001003 | 0.135679 |
| 3 | DEN | 0.442719 | 0.386610 | 0.127630 | 0.010701 | 0.112139 | 0.000455 | 0.110875 |
| 4 | DFW | 0.417090 | 0.400279 | 0.137167 | 0.018804 | 0.118154 | 0.000864 | 0.105878 |
| 5 | EWR | 0.401861 | 0.362531 | 0.133538 | 0.011004 | 0.122859 | 0.000979 | 0.107049 |
| 6 | HOU | 0.461493 | 0.377129 | 0.120927 | 0.009179 | 0.084441 | 0.000970 | 0.115730 |
| 7 | JFK | 0.357083 | 0.337509 | 0.119247 | 0.010789 | 0.129858 | 0.001389 | 0.068300 |
| 8 | LAS | 0.400518 | 0.375535 | 0.111114 | 0.005143 | 0.108687 | 0.000699 | 0.113227 |
| 9 | LAX | 0.384964 | 0.363739 | 0.104485 | 0.004377 | 0.102161 | 0.000770 | 0.096527 |
| 10 | MCO | 0.390880 | 0.372180 | 0.110292 | 0.020802 | 0.120200 | 0.001602 | 0.106974 |
| 11 | MIA | 0.405724 | 0.393572 | 0.136413 | 0.019572 | 0.126643 | 0.001830 | 0.099147 |
| 12 | MSY | 0.360981 | 0.338699 | 0.087303 | 0.010217 | 0.093822 | 0.000673 | 0.106858 |
| 13 | ORD | 0.405576 | 0.382400 | 0.121694 | 0.021180 | 0.129221 | 0.000579 | 0.098953 |
| 14 | PDX | 0.298812 | 0.321973 | 0.066115 | 0.004676 | 0.084625 | 0.000397 | 0.068284 |
| 15 | PHX | 0.392451 | 0.362283 | 0.101458 | 0.003507 | 0.090475 | 0.001300 | 0.098925 |
| 16 | RDU | 0.344065 | 0.332790 | 0.086812 | 0.008671 | 0.104936 | 0.000614 | 0.091356 |
| 17 | SEA | 0.351422 | 0.362239 | 0.082688 | 0.006774 | 0.105163 | 0.000757 | 0.068566 |
| 18 | SFO | 0.369786 | 0.367693 | 0.098870 | 0.004128 | 0.108372 | 0.000497 | 0.107824 |
| 19 | SJC | 0.340300 | 0.339721 | 0.079905 | 0.003123 | 0.076541 | 0.000883 | 0.098579 |
| 20 | SJU | 0.347668 | 0.355347 | 0.117447 | 0.002182 | 0.102229 | 0.001834 | 0.121220 |
| 21 | SMF | 0.358445 | 0.343033 | 0.077687 | 0.003122 | 0.080243 | 0.000597 | 0.100190 |
| 22 | SNA | 0.352813 | 0.332351 | 0.068681 | 0.004160 | 0.083964 | 0.000328 | 0.098166 |
| 23 | STL | 0.390224 | 0.347997 | 0.098000 | 0.008861 | 0.093257 | 0.000460 | 0.101216 |
In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Correlation between aircraft attributes and each delay measure.
corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()

# Fix: build a filtered copy instead of label_columns.remove('NAS_DELAY').
# The in-place remove mutated the shared global list, so (a) re-running this
# cell raised ValueError ('NAS_DELAY' already gone) and (b) every other cell
# using label_columns silently lost that column afterwards.
label_columns_no_nas = [col for col in label_columns if col != 'NAS_DELAY']
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns_no_nas], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays (Without NAS)')
plt.show()
In [14]:
import plotly.express as px

# Box plot of NAS delay per destination airport, colored by airport size.
# Fixes: dropped the f-prefix on two strings that contained no placeholders,
# and corrected the "Aiport" typo in the title.
# NOTE(review): target_airport_df was built by dropping DEST and by merging
# aircraft data with suffixes ["", "_origin"], so neither "DEST" nor
# "Large_dest" should exist on a fresh run — this cell appears to rely on
# stale kernel state. TODO: confirm which frame this was meant to run on.
fig = px.box(
    target_airport_df.sample(1000000, replace = False),
    x="DEST",
    y="NAS_DELAY",
    log_y = True,
    color = "Large_dest",
    color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
    height=400,
    title_text="Destination Airport vs NAS Delay",
    showlegend=True
)
fig.show()